#importing python libraries
import numpy as np
import pandas as pd
df = pd.read_csv('insurance.csv')
df.head()
| age | sex | bmi | children | smoker | region | charges | |
|---|---|---|---|---|---|---|---|
| 0 | 19 | female | 27.900 | 0 | yes | southwest | 16884.92400 |
| 1 | 18 | male | 33.770 | 1 | no | southeast | 1725.55230 |
| 2 | 28 | male | 33.000 | 3 | no | southeast | 4449.46200 |
| 3 | 33 | male | 22.705 | 0 | no | northwest | 21984.47061 |
| 4 | 32 | male | 28.880 | 0 | no | northwest | 3866.85520 |
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1338 entries, 0 to 1337 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 1338 non-null int64 1 sex 1338 non-null object 2 bmi 1338 non-null float64 3 children 1338 non-null int64 4 smoker 1338 non-null object 5 region 1338 non-null object 6 charges 1338 non-null float64 dtypes: float64(2), int64(2), object(3) memory usage: 73.3+ KB
df.describe()
| age | bmi | children | charges | |
|---|---|---|---|---|
| count | 1338.000000 | 1338.000000 | 1338.000000 | 1338.000000 |
| mean | 39.207025 | 30.663397 | 1.094918 | 13270.422265 |
| std | 14.049960 | 6.098187 | 1.205493 | 12110.011237 |
| min | 18.000000 | 15.960000 | 0.000000 | 1121.873900 |
| 25% | 27.000000 | 26.296250 | 0.000000 | 4740.287150 |
| 50% | 39.000000 | 30.400000 | 1.000000 | 9382.033000 |
| 75% | 51.000000 | 34.693750 | 2.000000 | 16639.912515 |
| max | 64.000000 | 53.130000 | 5.000000 | 63770.428010 |
df.size
9366
EDA (Exploratory Data Analysis)
# Mean of every numeric column for each (sex, smoker) group.
# numeric_only=True skips the text column `region`; pandas >= 2.0 raises a
# TypeError for it instead of silently dropping it as older versions did.
s_s = df.groupby(by=['sex', 'smoker']).mean(numeric_only=True)
s_s
| age | bmi | children | charges | ||
|---|---|---|---|---|---|
| sex | smoker | ||||
| female | no | 39.691042 | 30.539525 | 1.087751 | 8762.297300 |
| yes | 38.608696 | 29.608261 | 1.008696 | 30678.996276 | |
| male | no | 39.061896 | 30.770580 | 1.092843 | 8087.204731 |
| yes | 38.446541 | 31.504182 | 1.188679 | 33042.005975 |
# Average age per number of children, split by sex
# (pivot_table aggregates with the mean when no aggfunc is given).
s_s = df.pivot_table(index='children', columns='sex', values='age')
s_s
| sex | female | male |
|---|---|---|
| children | ||
| 0 | 38.346021 | 38.543860 |
| 1 | 39.506329 | 39.403614 |
| 2 | 40.512605 | 38.396694 |
| 3 | 42.181818 | 40.975000 |
| 4 | 42.000000 | 36.642857 |
| 5 | 37.000000 | 34.500000 |
# Highest charge recorded for each smoker / region combination.
s_s = df.pivot_table(
    index='smoker',
    columns='region',
    values='charges',
    aggfunc='max',
)
s_s
| region | northeast | northwest | southeast | southwest |
|---|---|---|---|---|
| smoker | ||||
| no | 32108.66282 | 33471.97189 | 36580.28216 | 36910.60803 |
| yes | 58571.07448 | 60021.39897 | 63770.42801 | 52590.82939 |
# Row(s) carrying the highest charge among north-east smokers. The maximum is
# looked up from the data instead of being pasted in as a float literal, so the
# filter keeps working if the CSV changes or the displayed value was rounded.
ne_smokers = df[(df['smoker'] == 'yes') & (df['region'] == 'northeast')]
ne_smokers[ne_smokers['charges'] == ne_smokers['charges'].max()].head()
| age | sex | bmi | children | smoker | region | charges | |
|---|---|---|---|---|---|---|---|
| 577 | 31 | female | 38.095 | 1 | yes | northeast | 58571.07448 |
# First five male smokers from the north-east whose charges exceed 5000.
df.query(
    "smoker == 'yes' and region == 'northeast' and charges > 5000 and sex == 'male'"
).head()
| age | sex | bmi | children | smoker | region | charges | |
|---|---|---|---|---|---|---|---|
| 38 | 35 | male | 36.670 | 1 | yes | northeast | 39774.27630 |
| 92 | 59 | male | 29.830 | 3 | yes | northeast | 30184.93670 |
| 98 | 56 | male | 19.950 | 0 | yes | northeast | 22412.64850 |
| 123 | 44 | male | 31.350 | 1 | yes | northeast | 39556.49450 |
| 157 | 18 | male | 25.175 | 0 | yes | northeast | 15518.18025 |
# First five male smokers from the north-east whose charges exceed 20000.
is_ne_smoker = (df['smoker'] == 'yes') & (df['region'] == 'northeast')
is_costly_male = (df['charges'] > 20000) & (df['sex'] == 'male')
df.loc[is_ne_smoker & is_costly_male].head()
| age | sex | bmi | children | smoker | region | charges | |
|---|---|---|---|---|---|---|---|
| 38 | 35 | male | 36.670 | 1 | yes | northeast | 39774.27630 |
| 92 | 59 | male | 29.830 | 3 | yes | northeast | 30184.93670 |
| 98 | 56 | male | 19.950 | 0 | yes | northeast | 22412.64850 |
| 123 | 44 | male | 31.350 | 1 | yes | northeast | 39556.49450 |
| 185 | 36 | male | 41.895 | 3 | yes | northeast | 43753.33705 |
EDA - Visualizations
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.countplot(data=df, x = 'sex')
<AxesSubplot:xlabel='sex', ylabel='count'>
sns.jointplot(data=df, x='age', y='charges')
<seaborn.axisgrid.JointGrid at 0x27d16712af0>
# Male policyholders charged at least 1600.
male = df.loc[(df['sex'] == 'male') & (df['charges'] >= 1600)]
male.head()
| age | sex | bmi | children | smoker | region | charges | |
|---|---|---|---|---|---|---|---|
| 1 | 18 | male | 33.770 | 1 | no | southeast | 1725.55230 |
| 2 | 28 | male | 33.000 | 3 | no | southeast | 4449.46200 |
| 3 | 33 | male | 22.705 | 0 | no | northwest | 21984.47061 |
| 4 | 32 | male | 28.880 | 0 | no | northwest | 3866.85520 |
| 8 | 37 | male | 29.830 | 2 | no | northeast | 6406.41070 |
# Female policyholders charged at least 1600.
female = df.loc[(df['sex'] == 'female') & (df['charges'] >= 1600)]
female.head()
| age | sex | bmi | children | smoker | region | charges | |
|---|---|---|---|---|---|---|---|
| 0 | 19 | female | 27.90 | 0 | yes | southwest | 16884.92400 |
| 5 | 31 | female | 25.74 | 0 | no | southeast | 3756.62160 |
| 6 | 46 | female | 33.44 | 1 | no | southeast | 8240.58960 |
| 7 | 37 | female | 27.74 | 3 | no | northwest | 7281.50560 |
| 9 | 60 | female | 25.84 | 0 | no | northwest | 28923.13692 |
# Line plot of charges (against the row index) for males charged at least 1600.
fig, ax = plt.subplots(figsize=(14, 5))
male_rows = (df['sex'] == 'male') & (df['charges'] >= 1600)
df.loc[male_rows, 'charges'].plot()
<AxesSubplot:>
# Bar chart: one bar per male policyholder charged at least 1600.
# The canvas is 140 in wide - presumably so the several hundred bars stay legible.
fig, ax = plt.subplots(figsize=(140, 5))
male_rows = (df['sex'] == 'male') & (df['charges'] >= 1600)
df.loc[male_rows, 'charges'].plot.bar()
<AxesSubplot:>
# Line plot of charges for females charged at least 1600; selecting with a
# one-element column list keeps the result a DataFrame, so the plot has a legend.
female_rows = (df['sex'] == 'female') & (df['charges'] >= 1600)
df.loc[female_rows, ['charges']].plot()
<AxesSubplot:>
# Bar chart: one bar per female policyholder charged at least 1600.
# The canvas is 140 in wide - presumably so the several hundred bars stay legible.
fig, ax = plt.subplots(figsize=(140, 5))
female_rows = (df['sex'] == 'female') & (df['charges'] >= 1600)
df.loc[female_rows, 'charges'].plot.bar()
<AxesSubplot:>
# Partition male charges into 100 equal-frequency bins (percentiles, not
# quartiles: q=100 asks qcut for one hundred quantile intervals).
pd.qcut(male['charges'], q=100)
1 (1710.039, 1771.385]
2 (4447.07, 4522.661]
3 (21230.311, 22030.743]
4 (3864.992, 4033.738]
8 (6397.702, 6599.067]
...
1324 (4239.035, 4407.564]
1325 (12960.454, 13143.685]
1327 (9331.882, 9504.273]
1329 (10220.693, 10423.281]
1333 (10586.342, 10739.625]
Name: charges, Length: 652, dtype: category
Categories (100, interval[float64, right]): [(1621.339, 1637.687] < (1637.687, 1694.95] < (1694.95, 1710.039] < (1710.039, 1771.385] ... (43916.891, 45907.758] < (45907.758, 47265.568] < (47265.568, 48674.519] < (48674.519, 62592.873]]
# Bar chart of how many policyholders there are at each age
# (value_counts orders the ages from most to least frequent).
fig, ax = plt.subplots(figsize=(10, 8))
age_counts = df['age'].value_counts()
age_counts.plot.bar()
<AxesSubplot:>
From Samples
# Distribution of Age
import plotly.express as px
import plotly.graph_objects as go

# Interactive age histogram (at most 100 bins), one colour per sex.
fig = px.histogram(df, x='age', color='sex', nbins=100)
fig.update_layout(height=700, title='Distribution of Age')
fig.show()
Feature Engineering
df.head()
| age | sex | bmi | children | smoker | region | charges | |
|---|---|---|---|---|---|---|---|
| 0 | 19 | female | 27.900 | 0 | yes | southwest | 16884.92400 |
| 1 | 18 | male | 33.770 | 1 | no | southeast | 1725.55230 |
| 2 | 28 | male | 33.000 | 3 | no | southeast | 4449.46200 |
| 3 | 33 | male | 22.705 | 0 | no | northwest | 21984.47061 |
| 4 | 32 | male | 28.880 | 0 | no | northwest | 3866.85520 |
# Replace the two binary text columns with integer category codes.
# Codes follow the sorted category labels: female -> 0, male -> 1 and
# no -> 0, yes -> 1.
df['sex'] = df['sex'].astype('category').cat.codes
df['smoker'] = df['smoker'].astype('category').cat.codes
df.head()
| age | sex | bmi | children | smoker | region | charges | |
|---|---|---|---|---|---|---|---|
| 0 | 19 | 0 | 27.900 | 0 | 1 | southwest | 16884.92400 |
| 1 | 18 | 1 | 33.770 | 1 | 0 | southeast | 1725.55230 |
| 2 | 28 | 1 | 33.000 | 3 | 0 | southeast | 4449.46200 |
| 3 | 33 | 1 | 22.705 | 0 | 0 | northwest | 21984.47061 |
| 4 | 32 | 1 | 28.880 | 0 | 0 | northwest | 3866.85520 |
Training and Testing model
Linear Regression model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Predict charges from age, bmi, children and smoker; sex and region are left out.
X = df.drop(columns=['region', 'charges', 'sex'])
y = df['charges']
# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)
lr = LinearRegression()
lr.fit(X_train, y_train)
LinearRegression()
lr.coef_
array([ 262.31060687, 331.74780875, 379.02452728, 23699.63589053])
X.columns
Index(['age', 'bmi', 'children', 'smoker'], dtype='object')
# One row per feature, holding its fitted regression coefficient.
cdf = pd.DataFrame({'Coeff': lr.coef_}, index=X.columns)
cdf
| Coeff | |
|---|---|
| age | 262.310607 |
| bmi | 331.747809 |
| children | 379.024527 |
| smoker | 23699.635891 |
predictions = lr.predict(X_test)
predictions
array([ 8.48006203e+03, 6.96853565e+03, 3.69728391e+04, 9.31049341e+03,
2.68392121e+04, 1.10958507e+04, 1.94880246e+01, 1.69290641e+04,
6.46901186e+02, 1.10744225e+04, 2.83669588e+04, 9.22723399e+03,
5.36873437e+03, 3.85188017e+04, 4.05796772e+04, 3.72708277e+04,
1.53365498e+04, 3.61254658e+04, 9.31712306e+03, 3.14643164e+04,
3.94794502e+03, 1.04186683e+04, 2.63531950e+03, 6.54233946e+03,
1.11476369e+04, 1.25725757e+04, 1.49492691e+04, 5.87087793e+03,
9.47742387e+03, 2.28675446e+03, 9.29501776e+03, 1.30758632e+04,
4.64285342e+03, 3.19104515e+03, 4.71781915e+03, 1.25754834e+04,
2.21197400e+03, 9.14392468e+03, 3.32901712e+04, 3.27656497e+04,
3.93329608e+03, 4.11643626e+03, 1.45112746e+04, 1.14889096e+04,
8.84056782e+03, 1.25446997e+04, 4.98953519e+03, 3.32259509e+03,
3.56153188e+04, 9.19176083e+03, 1.61342983e+04, 2.41033317e+03,
1.21787276e+04, 9.54787007e+02, 1.36777556e+04, 1.20922861e+04,
3.88190200e+03, 3.19434300e+04, 1.37611522e+04, 1.24993614e+04,
1.46414331e+04, 1.04986174e+04, 1.66804011e+04, 7.63009812e+03,
1.14491497e+04, 3.89388552e+03, 2.68164600e+04, 1.11016519e+04,
1.97237619e+03, 6.44115638e+03, 1.02043685e+04, 1.11060758e+04,
1.12143482e+04, 9.28317144e+03, 1.21132974e+04, 6.93536087e+03,
6.59574690e+03, 1.07650099e+04, 6.61670832e+03, 9.03325416e+03,
3.86965118e+03, 3.63851859e+04, 6.60435776e+03, 3.02223056e+04,
3.48590781e+04, 3.49393248e+04, 7.04595469e+03, 1.28664024e+04,
9.78337124e+03, 1.47477901e+04, 1.72255240e+04, 3.56806814e+04,
3.26404550e+04, 5.62713062e+03, 3.21611792e+04, 9.76134796e+03,
2.96363485e+04, 3.76017441e+03, 2.81198067e+04, 5.28966721e+03,
5.23792382e+03, 2.13769778e+03, 1.16771862e+04, 1.53688480e+04,
1.14147260e+04, 4.34442290e+03, 1.00473406e+04, 3.21019118e+04,
-5.89499042e+02, 3.31377970e+04, 3.53103858e+03, 1.02016104e+04,
1.39062946e+04, 3.11528957e+04, 1.10097264e+04, 4.13919893e+03,
1.27912496e+04, 3.21609921e+04, 8.24765860e+03, 3.05507311e+03,
7.89448438e+03, 1.04248436e+04, 1.46969661e+04, 5.75821557e+03,
3.57955713e+03, 1.00466511e+04, 1.10481571e+04, 1.06945660e+04,
1.47201333e+04, 7.50297920e+03, 5.55768811e+03, 9.26981420e+03,
9.40168130e+03, 1.20712267e+04, 8.67772637e+03, 1.57294786e+04,
7.91498613e+03, 3.22354002e+04, 3.57291643e+04, 3.11826105e+04,
5.92850627e+03, 1.21953525e+04, 6.13686774e+03, 1.44133217e+04,
2.66323835e+03, 3.33699332e+04, 6.08871975e+03, 5.35177463e+03,
1.41518431e+04, 7.24448851e+03, 3.85336804e+04, 2.92817692e+03,
5.93598036e+03, 3.11955632e+04, 1.13704620e+04, 7.98820048e+03,
1.44819643e+04, 1.00173408e+04, 2.72339919e+04, 3.30104037e+04,
1.40962228e+04, 1.47005797e+03, 1.36049564e+04, 1.73056733e+03,
5.58465547e+03, 1.16757627e+04, 4.02983811e+04, 3.65548868e+04,
3.35869055e+04, 4.17600836e+03, 7.86475717e+03, 8.95722656e+03,
1.19471010e+04, 4.68081734e+03, 2.32748355e+03, 3.24032553e+04,
2.54392363e+04, 1.77833147e+04, 2.63114088e+04, 1.02079034e+04,
3.69941729e+04, -9.67828744e+02, 6.76621225e+03, 8.04377090e+03,
3.77966768e+03, 4.92386983e+03, 5.38532176e+03, 4.62501709e+03,
1.53087183e+04, 1.11645361e+04, 7.04434582e+03, 1.96330035e+03,
1.02703214e+03, 3.21980438e+04, 1.67113096e+04, 1.22567258e+04,
1.06034943e+03, 1.20699831e+04, 1.01781909e+03, 9.17455883e+03,
1.76111415e+03, 3.39826921e+04, 1.08635399e+04, 2.35333917e+03,
2.57952683e+04, 2.65590651e+04, 9.44327455e+03, 1.53378488e+03,
1.34768682e+04, 1.40342870e+03, 1.09289453e+04, 1.08523262e+04,
1.63377387e+04, 2.70630920e+04, 7.10050069e+03, 4.70131908e+03,
5.98006969e+03, 1.34122255e+04, 1.14426947e+04, 8.47527289e+03,
4.92483905e+03, 1.22186247e+04, 1.38450762e+04, 3.59296472e+04,
3.98954007e+03, 2.91203880e+04, -6.15809029e+02, 2.69835158e+03,
1.12855368e+04, 1.56931841e+04, 5.29971408e+03, 6.93217348e+03,
4.09233733e+03, 3.17678656e+04, 7.28903221e+03, 1.26444111e+04,
5.38712301e+03, 9.89593035e+03, 3.64336741e+04, 4.54024143e+03,
9.38751707e+03, 3.14080243e+04, 5.56437294e+03, 4.69127932e+03,
9.01937196e+02, 4.69210604e+03, 4.79209537e+03, 6.78930450e+03,
1.86107688e+04, -1.65086219e+03, 2.63223186e+03, 1.09064801e+04,
3.47007504e+03, 9.89274296e+03, 3.48708465e+03, 5.19530616e+03,
1.28924327e+04, 6.01661182e+03, 7.88880623e+03, 6.95163640e+03,
8.71362707e+03, 1.05637929e+04, 2.80187555e+04, 3.94325193e+04,
1.17938877e+04, 7.21034451e+03, 4.11588569e+04, 1.26689251e+04,
7.00899034e+03, 7.96298981e+03, 9.45368666e+03, 1.10771057e+04,
9.99075639e+03, 1.78494344e+04, 1.39858968e+03, 2.31416529e+04,
1.21156955e+04, 3.26962125e+04, 4.68068014e+03, 1.33587307e+04,
1.03470575e+04, 1.74018047e+04, 1.00867886e+04, 1.13657531e+04,
3.25475282e+04, 2.82247730e+03, 1.38399949e+04, 3.95311795e+04,
4.98579549e+03, 5.94256544e+03, 2.87726556e+03, 1.18401026e+04,
2.50628878e+04, 1.34918075e+04, 9.33017376e+03, 9.70461593e+03,
1.36214137e+04, 1.33399150e+03, 2.64264397e+03, 3.08810496e+04,
3.04023136e+04, 1.36257503e+04, 3.45640063e+03, 2.52642420e+04,
1.36535694e+04, 3.09222383e+04, 2.96066219e+03, 3.92934151e+04,
1.12194295e+04, 4.98718162e+03, 7.07364370e+03, 2.62928673e+03,
2.58287603e+04, 1.48435695e+04, 8.51807987e+02, 1.30859634e+04,
1.29410456e+04, 1.48824598e+04, 3.51065955e+04, 1.42334385e+04,
3.19581111e+04, 1.02730716e+04, 1.84324892e+04, 6.23150851e+03,
9.20764628e+03, 9.64947305e+03, 1.54261645e+04, 9.50497037e+03,
7.83352795e+03, 1.53928918e+04, 1.24445731e+04, 1.43466871e+04,
7.74519787e+03, 2.62851932e+04, 9.45115845e+03, 1.70923356e+03,
4.46366504e+03, 1.44935381e+04, 3.59430045e+04, 9.97813322e+03,
1.27902804e+04, 4.91050548e+03, 4.77799874e+03, 4.15196461e+03,
2.27846076e+03, 8.92045459e+03, 7.29162272e+03, 2.68816931e+03,
1.33778142e+04, 8.83525671e+03, 6.20008510e+03, 1.07113389e+03,
9.88224352e+03, 5.13784716e+03, 3.28981243e+04, 2.86503253e+04,
3.72700830e+04, 5.99135838e+03, 8.84272369e+03, 8.61183649e+03,
3.85969875e+03, 3.13075682e+04, 6.64208653e+03, 2.87059759e+04,
3.59301069e+04, 7.31438539e+03, 1.34232292e+04, 9.62869871e+03,
8.29934680e+03, 1.22396841e+04, 2.98995286e+04, 1.73943929e+04,
1.16048591e+04, 3.81970194e+03, -7.73756276e+02, 1.18050392e+04,
3.12635217e+04, 1.32105934e+04, 1.16042445e+04, 7.71764605e+03,
3.07543766e+03, 7.50734612e+03, 7.72104375e+03, 1.09874786e+04,
3.36274974e+04, 3.96458998e+04, 1.24009862e+04, 8.27911946e+03,
1.62997125e+04, 1.53512540e+04, 9.80378566e+03, 9.59505894e+03,
8.69326253e+03, 3.00683248e+03, 1.04714629e+04, 4.18508420e+03,
1.11403446e+04, 1.55258706e+04, 6.86237635e+03, 1.64813993e+03,
1.47497607e+04, 4.89973037e+02, 1.39044933e+04, 8.78881202e+03,
1.34508860e+04, 3.58726854e+04, 3.36895299e+04, 3.56004722e+04,
6.15566619e+03, 5.54387119e+03, 1.66690696e+04, 7.72741673e+03,
3.78720253e+04, 5.28003905e+03, 7.99907758e+03, 1.04983305e+04,
3.04572320e+04, 4.69820461e+03, 3.26370188e+03, 1.61014032e+04,
3.17081781e+03, 6.30163522e+03, 9.75199240e+03, -5.82634248e+02,
2.96851468e+04, 7.94428932e+03, 1.02645979e+04, 5.89515152e+03,
7.89508657e+03, 1.18152268e+04, 2.92915606e+04, 9.75540252e+03,
1.10809595e+04, 6.00365907e+03, 3.92512716e+03, 1.35923962e+03,
8.09445242e+03, 1.13177049e+04, 1.06488981e+04, 8.94717438e+03,
5.73546352e+03, 4.40113719e+03])
y_test
764 9095.06825
887 5272.17580
890 29330.98315
1293 9301.89355
259 33750.29180
...
644 18806.14547
602 11070.53500
731 10065.41300
321 24671.66334
479 1824.28540
Name: charges, Length: 442, dtype: float64
plt.scatter(y_test, predictions)
<matplotlib.collections.PathCollection at 0x27d2684f1c0>
# Distribution of the residuals (actual minus predicted charges).
# histplot replaces distplot, which seaborn has deprecated and will remove;
# kde=True with stat="density" keeps the density histogram plus KDE curve.
sns.histplot(y_test - predictions, kde=True, stat="density")
C:\Users\user\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
<AxesSubplot:xlabel='charges', ylabel='Density'>
Model Evaluation
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# Mean absolute error and mean squared error of the linear model on the test set.
print(mean_absolute_error(y_test, predictions))
print(mean_squared_error(y_test, predictions))
# Root mean squared error: the square root of the MSE, in the same units as
# charges. np.sqrt is used so no separate `sqrt` import is needed.
print(np.sqrt(mean_squared_error(y_test, predictions)))
4210.867850540483 35299792.10029327
lr.score(X_test, y_test)
0.7591192126340146
Insurance Prediction
X.head()
| age | bmi | children | smoker | |
|---|---|---|---|---|
| 0 | 19 | 27.900 | 0 | 1 |
| 1 | 18 | 33.770 | 1 | 0 |
| 2 | 28 | 33.000 | 3 | 0 |
| 3 | 33 | 22.705 | 0 | 0 |
| 4 | 32 | 28.880 | 0 | 0 |
# Predict the charge for one person: age 19, bmi 27.9, no children, smoker (1).
# The input is a DataFrame carrying the training column names so scikit-learn
# does not emit its "X does not have valid feature names" warning.
sample = pd.DataFrame(
    [[19, 27.9, 0, 1]],
    columns=['age', 'bmi', 'children', 'smoker'],
)
pred = lr.predict(sample)
print('value charge to frederick insurance = ', pred)
value charge to frederick insurance = [25505.58586529]
C:\Users\user\anaconda3\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names
Logistic Regression (Predicting smoker or non-smoker)
df.head()
| age | sex | bmi | children | smoker | region | charges | |
|---|---|---|---|---|---|---|---|
| 0 | 19 | 0 | 27.900 | 0 | 1 | southwest | 16884.92400 |
| 1 | 18 | 1 | 33.770 | 1 | 0 | southeast | 1725.55230 |
| 2 | 28 | 1 | 33.000 | 3 | 0 | southeast | 4449.46200 |
| 3 | 33 | 1 | 22.705 | 0 | 0 | northwest | 21984.47061 |
| 4 | 32 | 1 | 28.880 | 0 | 0 | northwest | 3866.85520 |
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Target is the encoded smoker flag; every column except smoker and region is a feature.
X = df.drop(columns=['smoker', 'region'])
y = df['smoker']
# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)
logmodel = LogisticRegression()
logmodel.fit(X_train, y_train)
LogisticRegression()
predictions = logmodel.predict(X_test)
predictions
array([0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0,
0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1,
0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1,
0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 0], dtype=int8)
y_test
764 0
887 0
890 1
1293 0
259 1
..
644 0
602 0
731 0
321 0
479 0
Name: smoker, Length: 442, dtype: int8
Model Evaluation
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))
precision recall f1-score support
0 0.96 0.96 0.96 356
1 0.81 0.81 0.81 86
accuracy 0.93 442
macro avg 0.88 0.88 0.88 442
weighted avg 0.93 0.93 0.93 442
logmodel.score(X_test, y_test)
0.9276018099547512
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, predictions))
[[340 16] [ 16 70]]
My Predictions
logmodel.score(X_test, y_test)
0.9276018099547512
X.head()
| age | sex | bmi | children | charges | |
|---|---|---|---|---|---|
| 0 | 19 | 0 | 27.900 | 0 | 16884.92400 |
| 1 | 18 | 1 | 33.770 | 1 | 1725.55230 |
| 2 | 28 | 1 | 33.000 | 3 | 4449.46200 |
| 3 | 33 | 1 | 22.705 | 0 | 21984.47061 |
| 4 | 32 | 1 | 28.880 | 0 | 3866.85520 |
y
0 1
1 0
2 0
3 0
4 0
..
1333 0
1334 0
1335 0
1336 0
1337 1
Name: smoker, Length: 1338, dtype: int8
# Classify one hand-made record as smoker (1) or non-smoker (0).
# A DataFrame with the training column names avoids scikit-learn's
# "X does not have valid feature names" warning.
sample = pd.DataFrame(
    [[19, 0, 27.900, 0, 99999.92400]],
    columns=['age', 'sex', 'bmi', 'children', 'charges'],
)
logmodel.predict(sample)
C:\Users\user\anaconda3\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but LogisticRegression was fitted with feature names
array([1], dtype=int8)
Decision Tree
df.head()
| age | sex | bmi | children | smoker | region | charges | |
|---|---|---|---|---|---|---|---|
| 0 | 19 | 0 | 27.900 | 0 | 1 | southwest | 16884.92400 |
| 1 | 18 | 1 | 33.770 | 1 | 0 | southeast | 1725.55230 |
| 2 | 28 | 1 | 33.000 | 3 | 0 | southeast | 4449.46200 |
| 3 | 33 | 1 | 22.705 | 0 | 0 | northwest | 21984.47061 |
| 4 | 32 | 1 | 28.880 | 0 | 0 | northwest | 3866.85520 |
from sklearn.model_selection import train_test_split
# Target is the encoded smoker flag; every column except smoker and region is a feature.
X = df.drop(['smoker', 'region'], axis=1)
y = df['smoker']
# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
from sklearn.tree import DecisionTreeClassifier
# The tree is seeded as well: scikit-learn permutes the candidate features at
# every split, so an unseeded tree can score differently from run to run.
dr = DecisionTreeClassifier(random_state=42)
dr.fit(X_train, y_train)
DecisionTreeClassifier()
predict = dr.predict(X_test)
predict
array([0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0,
0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1,
0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 0], dtype=int8)
y_test
764 0
887 0
890 1
1293 0
259 1
..
644 0
602 0
731 0
321 0
479 0
Name: smoker, Length: 442, dtype: int8
dr.score(X_test, y_test)
0.9705882352941176
from sklearn.metrics import classification_report
# Score the decision tree's own output (`predict`). `predictions` still holds
# the logistic-regression labels here, which made this report a copy of the
# logistic-regression one.
print(classification_report(y_test, predict))
precision recall f1-score support
0 0.96 0.96 0.96 356
1 0.81 0.81 0.81 86
accuracy 0.93 442
macro avg 0.88 0.88 0.88 442
weighted avg 0.93 0.93 0.93 442
from sklearn.metrics import confusion_matrix
# Use the decision tree's own output (`predict`); `predictions` still holds the
# logistic-regression labels at this point in the notebook.
print(confusion_matrix(y_test, predict))
[[340 16] [ 16 70]]
My Predictions
X.head()
| age | sex | bmi | children | charges | |
|---|---|---|---|---|---|
| 0 | 19 | 0 | 27.900 | 0 | 16884.92400 |
| 1 | 18 | 1 | 33.770 | 1 | 1725.55230 |
| 2 | 28 | 1 | 33.000 | 3 | 4449.46200 |
| 3 | 33 | 1 | 22.705 | 0 | 21984.47061 |
| 4 | 32 | 1 | 28.880 | 0 | 3866.85520 |
# Classify one hand-made record as smoker (1) or non-smoker (0).
# A DataFrame with the training column names avoids scikit-learn's
# "X does not have valid feature names" warning.
sample = pd.DataFrame(
    [[52, 0, 3, 0, 500000]],
    columns=['age', 'sex', 'bmi', 'children', 'charges'],
)
dr.predict(sample)
C:\Users\user\anaconda3\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but DecisionTreeClassifier was fitted with feature names
array([1], dtype=int8)
Random Forest
from sklearn.model_selection import train_test_split
df.head()
| age | sex | bmi | children | smoker | region | charges | |
|---|---|---|---|---|---|---|---|
| 0 | 19 | 0 | 27.900 | 0 | 1 | southwest | 16884.92400 |
| 1 | 18 | 1 | 33.770 | 1 | 0 | southeast | 1725.55230 |
| 2 | 28 | 1 | 33.000 | 3 | 0 | southeast | 4449.46200 |
| 3 | 33 | 1 | 22.705 | 0 | 0 | northwest | 21984.47061 |
| 4 | 32 | 1 | 28.880 | 0 | 0 | northwest | 3866.85520 |
# Target is the encoded smoker flag; every column except smoker and region is a feature.
X = df.drop(['region', 'smoker'], axis=1)
y = df['smoker']
# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
from sklearn.ensemble import RandomForestClassifier
# The forest is seeded too: bootstrap sampling and feature selection are
# random, so an unseeded forest gives different scores on every run.
rf = RandomForestClassifier(n_estimators=20, random_state=42)
rf.fit(X_train, y_train)
RandomForestClassifier(n_estimators=20)
rf.score(X_test, y_test)
0.9660633484162896
predictions = rf.predict(X_test)
predictions
pd.DataFrame(predictions).head()
| 0 | |
|---|---|
| 0 | 0 |
| 1 | 0 |
| 2 | 1 |
| 3 | 0 |
| 4 | 1 |
y_test
764 0
887 0
890 1
1293 0
259 1
..
644 0
602 0
731 0
321 0
479 0
Name: smoker, Length: 442, dtype: int8
pd.DataFrame(y_test).head()
| smoker | |
|---|---|
| 764 | 0 |
| 887 | 0 |
| 890 | 1 |
| 1293 | 0 |
| 259 | 1 |
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))
precision recall f1-score support
0 0.99 0.97 0.98 356
1 0.89 0.94 0.92 86
accuracy 0.97 442
macro avg 0.94 0.96 0.95 442
weighted avg 0.97 0.97 0.97 442
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, predictions))
[[346 10] [ 5 81]]
cm = confusion_matrix(y_test, predictions)
cm
array([[346, 10],
[ 5, 81]], dtype=int64)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Heatmap of the confusion matrix: predicted class on x, true class on y.
plt.figure(figsize=(10,8))
# fmt='d' prints the cell counts as whole numbers; the default '.2g' format
# would render a count such as 346 as 3.5e+02.
sns.heatmap(cm, annot=True, fmt='d')
plt.xlabel('Predicted')
plt.ylabel('Truth')
Text(95.72222222222221, 0.5, 'Truth')
K-Fold Cross-Validation
# Cross-validation helpers and the estimators to compare.
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
# Cross-validated accuracy of a fresh logistic regression on the full X / y
# (no cv argument is passed, so scikit-learn's default fold count applies).
cross_val_score(LogisticRegression(), X,y)
array([0.92164179, 0.93283582, 0.93656716, 0.92883895, 0.94756554])
cross_val_score(SVC(), X,y)
array([0.90671642, 0.89925373, 0.94776119, 0.91011236, 0.92883895])
cross_val_score(RandomForestClassifier(), X,y)
array([0.95895522, 0.94776119, 0.97761194, 0.95505618, 0.98127341])
cross_val_score(RandomForestClassifier(n_estimators=40), X,y)
array([0.94776119, 0.94402985, 0.98134328, 0.95505618, 0.98876404])